In this analysis module, we have annotated cell types in
SCPCP000004 samples using two methods: SingleR
and scANVI/scArches, both using the NBAtlas dataset as a
reference. In this notebook, we derive the final annotations using
information from the SingleR labels, the
scANVI/scArches labels, and existing consensus cell type
annotations. All final annotations use labels present in the NBAtlas
dataset.
We use the following approach to determine the final annotation.
Throughout, we use CL ontology ids to perform comparisons. For more
information about how these ontology ids were determined for NBAtlas
labels, see ../references/README.md.
- If
SingleR and scANVI/scArches labels
exactly agree, we assign that label
- If broad label families (e.g.,
T cell would be the
broad family for CD4+ T cell) for SingleR and
scANVI/scArches labels agree, we assign the broad family
label.
- If
SingleR and scANVI/scArches labels
disagree, we then compare to the consensus cell type; if at least one
inference agrees with the consensus cell type, we assign the inferred
label.
- We perform this first at the label level and then at the broad
family level.
- Finally, following the conclusions of Bonine et al. (2024), we
designate any cell labeled as
Neuroendocrine as a tumor
cell.
Setup
suppressPackageStartupMessages({
library(ggplot2)
library(patchwork)
library(SingleCellExperiment)
})
theme_set(
theme_bw()
)
umap_theme <- list(
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "bottom",
aspect.ratio = 1
)
)
# settings
options(
dplyr.summarise.inform = FALSE,
readr.show_col_types = FALSE
)
Paths
module_dir <- rprojroot::find_root(rprojroot::is_renv_project)
repository_base <- file.path(module_dir, "..", "..")
data_dir <- file.path(repository_base, "data", "current", "SCPCP000004")
merged_dir <- file.path(repository_base, "data", "current", "results", "merge-sce", "SCPCP000004")
ref_dir <- file.path(module_dir, "references")
results_dir <- file.path(module_dir, "results")
singler_dir <- file.path(results_dir, "singler")
scanvi_dir <- file.path(results_dir, "scanvi")
# merged SCE file
sce_file <- file.path(
merged_dir,
"SCPCP000004_merged.rds"
)
# SingleR files
singler_files <- list.files(
path = singler_dir,
pattern = "_singler-annotations\\.tsv",
recursive = TRUE,
full.names = TRUE
) |>
# add names as library id
purrr::set_names(
\(x) {
stringr::str_remove(basename(x), "_singler-annotations.tsv")
}
)
# scanvi predictions
scanvi_file <- file.path(
scanvi_dir,
"scanvi-predictions.tsv"
)
# palette file
palette_file <- file.path(
module_dir,
"palettes",
"nbatlas-cell-type-palette.tsv"
)
# label map file
nbatlas_label_map_file <- file.path(
ref_dir,
"nbatlas-label-map.tsv"
)
# label ontologies file
nbatlas_ontology_file <- file.path(
ref_dir,
"nbatlas-ontology-ids.tsv"
)
# marker genes from NBAtlas
nbatlas_markers_file <- file.path(
ref_dir,
"nbatlas-marker-genes.tsv"
)
# validation groups URL, used to obtain consensus "family" ontologies (aka broad validation group ontologies)
validation_url <- "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-consensus/references/consensus-validation-groups.tsv"
Functions
# Source utilities functions:
# - subset_nbatlas_markers()
# - faceted_umap()
# - generate_dotplot()
source(file.path(module_dir, "scripts", "utils", "celltype-utils.R"))
#' Prepare data frame of scANVI or SingleR labels for annotation
#'
#' @param df Data frame to prepare
#' @param annot_type "singler" or "scanvi"
#' @param ontology_df Data frame of ontology ids
#' @param label_map_df Data frame mapping labels to family labels
#'
#' @returns Wide data frame with ontologies and family labels for the given annotation type
prep_for_annotation <- function(
df,
annot_type,
ontology_df,
label_map_df) {
df |>
dplyr::select(all_of(c("cell_id", annot_type))) |>
dplyr::rename(label = annot_type) |>
####### Join in the family labels
dplyr::left_join(label_map_df, by = c("label" = "NBAtlas_label")) |>
dplyr::rename(family = NBAtlas_family) |>
######### Obtain LABEL ontologies
dplyr::left_join(ontology_slim_df, by = c("label" = "NBAtlas_label")) |>
dplyr::rename(label_ontology = CL_ontology_id) |>
######## Obtain FAMILY ontologies
dplyr::left_join(ontology_slim_df, by = c("family" = "NBAtlas_label")) |>
dplyr::rename(family_ontology = CL_ontology_id) |>
# rename columns to start with `annot_type`
dplyr::rename_with(\(x) {paste(annot_type, x, sep = "_")}, -cell_id)
}
Read and prepare input data
Read merged SCE object:
merged_sce <- readRDS(sce_file)
# immediately remove assays we don't need for space
assay(merged_sce, "spliced") <- NULL
assay(merged_sce, "counts") <- NULL
reducedDim(merged_sce, "PCA") <- NULL
Read SingleR results:
singler_df <- singler_files |>
purrr::map(readr::read_tsv) |>
purrr::list_rbind(names_to = "library_id") |>
dplyr::mutate(
# add cell id column for unique rows & joining
cell_id = glue::glue("{library_id}-{barcodes}"),
# recode NA -> "Unknown" and NE -> "Neuroendocrine"
singler = dplyr::case_when(
is.na(pruned.labels) ~ "Unknown",
pruned.labels == "NE" ~ "Neuroendocrine",
.default = pruned.labels)
) |>
dplyr::select(cell_id, singler)
Read scANVI results:
scanvi_df <- readr::read_tsv(
file.path(scanvi_dir, "scanvi_predictions.tsv")
) |>
dplyr::select(
cell_id,
scanvi = scanvi_prediction,
starts_with("pp_")
) |>
tidyr::pivot_longer(
starts_with("pp_"),
names_to = "posterior_celltype",
values_to = "posterior"
) |>
dplyr::mutate(posterior_celltype = stringr::str_remove(posterior_celltype, "^pp_")) |>
dplyr::filter(scanvi == posterior_celltype) |>
# recode to Unknown if below the threshold, and NE -> Neuroendocrine
dplyr::mutate(scanvi = dplyr::case_when(
posterior < params$scanvi_pp_threshold ~ "Unknown",
scanvi == "NE" ~ "Neuroendocrine",
.default = scanvi)) |>
dplyr::select(cell_id, scanvi)
Read additional helper files:
palette_df <- readr::read_tsv(palette_file)
celltype_pal <- palette_df$hex_color
names(celltype_pal) <- palette_df$NBAtlas_label
label_map_df <- readr::read_tsv(nbatlas_label_map_file)
ontology_df <- readr::read_tsv(nbatlas_ontology_file)
nbatlas_markers_df <- readr::read_tsv(nbatlas_markers_file)
validation_df <- readr::read_tsv(validation_url) |>
dplyr::rename(
consensus = consensus_annotation,
consensus_family_label = validation_group_annotation,
consensus_family_ontology = validation_group_ontology
)
Combine annotations into a single data frame:
celltype_df <- scuttle::makePerCellDF(
merged_sce,
use.coldata = c("sample_id", "is_xenograft", "consensus_celltype_annotation", "consensus_celltype_ontology")
) |>
tibble::rownames_to_column("cell_id") |>
tidyr::separate(cell_id, into = c("library_id", "cell_barcode"), remove = FALSE) |>
# keep UMAP for eventual viz
dplyr::rename(
consensus = consensus_celltype_annotation,
consensus_ontology = consensus_celltype_ontology,
UMAP1 = UMAP.1,
UMAP2 = UMAP.2
) |>
dplyr::left_join(singler_df, by = "cell_id") |>
dplyr::left_join(scanvi_df, by = "cell_id")
# Pull out metadata we'll want later
sample_metadata <- celltype_df |>
dplyr::select(sample_id, library_id, is_xenograft) |>
unique()
Annotate
Firt we prepare the data frame for annotation.
ontology_slim_df <- ontology_df |>
dplyr::select(-CL_annotation)
singler_annotation_df <- prep_for_annotation(
celltype_df,
"singler",
ontology_slim_df,
label_map_df
)
scanvi_annotation_df <- prep_for_annotation(
celltype_df,
"scanvi",
ontology_slim_df,
label_map_df
)
annotation_df <- celltype_df |>
dplyr::select(cell_id, consensus, consensus_ontology) |>
dplyr::left_join(singler_annotation_df, by = "cell_id") |>
dplyr::left_join(scanvi_annotation_df, by = "cell_id") |>
dplyr::left_join(validation_df, by = c("consensus", "consensus_ontology")) |>
dplyr::select(cell_id, starts_with("consensus"), starts_with("singler"), starts_with("scanvi"))
In the code below, we compare ontology ids to derive final
annotations. However, based on those comparisons, we assign a final
label, not final ontology id, and then add the final ontology
ids back into those annotations. This is because we have several
NA ontology ids, and we would not be able to unambiguously
map their labels in if we assigned ontologies.
final_annotation_df <- annotation_df |>
dplyr::mutate(
# For all comparisons, make sure we aren't comparing NA to NA and getting TRUE
final_label = dplyr::case_when(
########### Check for exact match between SingleR/scANVI
!is.na(scanvi_label_ontology) & singler_label_ontology == scanvi_label_ontology ~ scanvi_label,
########### Check for family match between SingleR/scANVI
!is.na(scanvi_family_ontology) & singler_family_ontology == scanvi_family_ontology ~ scanvi_family,
########## Now use agreement with consensus to assign a label: first by label, and then by family
!is.na(consensus_ontology) & singler_label_ontology == consensus_ontology ~ singler_label,
!is.na(consensus_ontology) & scanvi_label_ontology == consensus_ontology ~ scanvi_label,
!is.na(consensus_family_ontology) & singler_family_ontology == consensus_family_ontology ~ singler_family,
!is.na(consensus_family_ontology) & scanvi_family_ontology == consensus_family_ontology ~ scanvi_family,
# Everything else is Unknown
.default = "Unknown"
)
) |>
# Now, we can bring map the final ontologies into the data frame
dplyr::inner_join(
ontology_df |> dplyr::rename(final_label = NBAtlas_label, final_id = CL_ontology_id),
by = "final_label"
) |>
# add tumor column for visualization
dplyr::mutate(cell_class = dplyr::case_when(
final_label == "Neuroendocrine" ~ "tumor",
final_label == "Unknown" ~ "unknown",
.default = "normal"
))
Explore final annotations
What fraction of cells have been annotated?
sum(!is.na(final_annotation_df$final_id)) / nrow(final_annotation_df)
[1] 0.8506399
What fraction of cells have been annotated per library?
frac_labeled_df <- final_annotation_df |>
tidyr::separate(cell_id, into = c("library_id", "barcode"), remove = FALSE) |>
dplyr::group_by(library_id) |>
dplyr::summarize(
frac_labeled = sum(!is.na(final_id))/dplyr::n(),
library_size = dplyr::n()
) |>
# get PDX logical for coloring
dplyr::inner_join(sample_metadata, by = "library_id")
p1 <- ggplot(frac_labeled_df) +
aes(x = frac_labeled) +
geom_histogram(bins = 40) +
labs(x = "Fraction of library labeled") +
theme_bw()
p2 <- ggplot(frac_labeled_df) +
aes(x = library_size, y = frac_labeled, color = is_xenograft) +
geom_point() +
labs(
x = "Total cells in library",
y = "Fraction of library labeled"
) +
theme_bw() +
theme(legend.position = "bottom")
p1 + p2

UMAPs
In this section, we show the UMAP from the merged (not
batch-corrected!) SCPCP000004 object colored by cell type
annotations and other potentially relevant metadata.
label_order <- label_map_df |>
dplyr::add_count(NBAtlas_family) |>
dplyr::arrange(desc(n), NBAtlas_family) |>
dplyr::filter(NBAtlas_label %in% final_annotation_df$final_label) |>
dplyr::pull(NBAtlas_label)
label_order <- c(label_order, "Unknown")
# prepare data frame for plotting
umap_df <- celltype_df |>
dplyr::select(cell_id, sample_id, library_id, is_xenograft, UMAP1, UMAP2) |>
dplyr::left_join(final_annotation_df, by = "cell_id") |>
dplyr::mutate(final_label = factor(final_label, levels = label_order))
Colored by cell type assignment
Note that in the UMAP below, cell types in these same “family” each
share a color:
- T cells
- Natural killer cells
- Conventional dendritic cells
- Monocytes
ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = final_label) +
geom_point(alpha = 0.2, size = 0.3) +
scale_color_manual(values = celltype_pal) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
umap_theme +
theme(legend.position = "bottom")

Faceted by cell type assignment
This UMAP is the same as the previous one, except each facet
highlights one specific cell type. In this plot, only cell types with at
least 10 cells are shown.
umap_df |>
dplyr::add_count(final_label) |>
dplyr::filter(n > 10) |>
faceted_umap(
annotation_column = final_label,
celltype_colors = celltype_pal,
facet_wrap_cols = 6
) +
umap_theme +
theme(legend.position = "none")

Colored by tumor classification
ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = cell_class) +
geom_point(alpha = 0.2, size = 0.3) +
scale_color_brewer(palette = "Set2") +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
umap_theme

Colored by PDX status
ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = is_xenograft) +
geom_point(alpha = 0.2, size = 0.3) +
scale_color_brewer(palette = "Set1") +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
umap_theme

Dot plot
This section shows dot plots of marker gene expression for the final
annotations. Specifically, we show the top 5 highest log2FC
marker genes per NBAtlas cell type for validation. Marker genes are
taken directly from the NBAtlas paper where they were identified with
Seurat::FindMarkers().
Marker genes that are present in three or more cell types are not
included. In addition, the upper limit for the color scale has been
censored to a maximum of 6 to ensure visibility of expression levels
across all cell types.
input_dotplot_df <- final_annotation_df |>
dplyr::filter(final_label != "Unknown") |>
# recode cell types to match what is present in NBAtlas marker genes
dplyr::mutate(
label_recoded = dplyr::case_when(
stringr::str_detect(final_label, "monocyte") ~ "Monocyte",
stringr::str_detect(final_label, "cDC") ~ "cDC",
.default = final_label
)) |>
dplyr::select(cell_id, label_recoded)
# data frame of top 5 upregulated genes per cell type
top_nbatlas_markers_df <- subset_nbatlas_markers(nbatlas_markers_df, 5, "up") |>
# only markers in no more than 2 categories
# this is a good middle ground to ensure there are at least 2 markers per cell type
dplyr::add_count(ensembl_gene_id) |>
dplyr::filter(n < 3) |>
dplyr::select(-n)
# get total number of cells per final annotation group and set up y_label
total_cells_df <- input_dotplot_df |>
dplyr::count(label_recoded, name = "total_cells") |>
dplyr::mutate(y_label = glue::glue("{label_recoded} ({total_cells})")) |>
dplyr::inner_join(label_map_df, by = c("label_recoded" = "NBAtlas_label")) |>
dplyr::group_by(NBAtlas_family) |>
dplyr::mutate(family_total_cells = sum(total_cells)) |>
# order by family count, but put Unknown at the end
dplyr::arrange(desc(family_total_cells)) |>
# amazing new dplyr trick; only this row is TRUE, so it gets moved to the last row
dplyr::arrange(label_recoded == "Unknown")
total_cells_df$y_label <- factor(total_cells_df$y_label, levels = rev(total_cells_df$y_label))
nbatlas_bar_order <- total_cells_df$label_recoded
# Prepare the expressed_genes vector
# we only care about if that gene is expressed otherwise we won't waste memory and include it
gene_sums <- rowData(merged_sce) |>
as.data.frame() |>
dplyr::select(contains("detected")) |>
as.matrix() |>
rowSums()
expressed_genes <- names(gene_sums)[gene_sums > 0]
generate_dotplot(
merged_sce = merged_sce,
markers_df = top_nbatlas_markers_df,
celltype_df = input_dotplot_df,
total_cells_df = total_cells_df,
expressed_genes = expressed_genes,
bar_order = nbatlas_bar_order,
celltype_palette = celltype_pal,
min_cells = 10 # only show cell types with at least 10 cells
) +
theme(legend.position = "bottom")

Export
Export the final table with submission-ready column names.
final_annotation_df |>
dplyr::select(
cell_id,
cell_type_assignment = final_label,
tumor_cell_classification = cell_class,
CL_ontology_id = final_id,
CL_annotation
) |>
readr::write_tsv(
params$annotations_tsv
)
Session Info
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS 15.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
[3] Biobase_2.64.0 GenomicRanges_1.56.2
[5] GenomeInfoDb_1.40.1 IRanges_2.38.1
[7] S4Vectors_0.42.1 BiocGenerics_0.50.0
[9] MatrixGenerics_1.16.0 matrixStats_1.5.0
[11] patchwork_1.3.0 ggplot2_3.5.2
loaded via a namespace (and not attached):
[1] gtable_0.3.6 xfun_0.52
[3] bslib_0.9.0 lattice_0.22-6
[5] tzdb_0.5.0 bitops_1.0-9
[7] vctrs_0.6.5 tools_4.4.0
[9] generics_0.1.4 curl_6.3.0
[11] parallel_4.4.0 tibble_3.3.0
[13] pkgconfig_2.0.3 Matrix_1.7-1
[15] RColorBrewer_1.1-3 sparseMatrixStats_1.16.0
[17] lifecycle_1.0.4 GenomeInfoDbData_1.2.12
[19] compiler_4.4.0 farver_2.1.2
[21] stringr_1.5.1 codetools_0.2-20
[23] htmltools_0.5.8.1 sass_0.4.10
[25] yaml_2.3.10 pillar_1.10.2
[27] crayon_1.5.3 jquerylib_0.1.4
[29] tidyr_1.3.1 BiocParallel_1.38.0
[31] DelayedArray_0.30.1 cachem_1.1.0
[33] abind_1.4-8 tidyselect_1.2.1
[35] digest_0.6.37 stringi_1.8.7
[37] dplyr_1.1.4 purrr_1.0.4
[39] labeling_0.4.3 rprojroot_2.0.4
[41] fastmap_1.2.0 grid_4.4.0
[43] cli_3.6.5 SparseArray_1.4.8
[45] magrittr_2.0.3 S4Arrays_1.4.1
[47] readr_2.1.5 withr_3.0.2
[49] DelayedMatrixStats_1.26.0 scales_1.4.0
[51] UCSC.utils_1.0.0 bit64_4.6.0-1
[53] rmarkdown_2.29 XVector_0.44.0
[55] httr_1.4.7 jpeg_0.1-11
[57] ggmap_4.0.1 bit_4.6.0
[59] png_0.1-8 hms_1.1.3
[61] beachmat_2.20.0 evaluate_1.0.3
[63] knitr_1.50 viridisLite_0.4.2
[65] rlang_1.1.6 Rcpp_1.0.14
[67] scuttle_1.14.0 glue_1.8.0
[69] BiocManager_1.30.26 renv_1.1.3
[71] vroom_1.6.5 jsonlite_2.0.0
[73] plyr_1.8.9 R6_2.6.1
[75] zlibbioc_1.50.0
---
title: "Derive final annotations based on NBAtlas reference"
author: "Stephanie J. Spielman"
date: "`r Sys.Date()`"
output:
  html_notebook:
    toc: true
    toc_depth: 3
    code_folding: hide
params:
  scanvi_pp_threshold: 0.75    
  annotations_tsv: "results/SCPCP000004-annotations.tsv"
  testing: FALSE
---

In this analysis module, we have annotated cell types in `SCPCP000004` samples using two methods: `SingleR` and `scANVI/scArches`, both using the NBAtlas dataset as a reference.
In this notebook, we derive the final annotations using information from the `SingleR` labels, the `scANVI/scArches` labels, and existing consensus cell type annotations.
All final annotations use labels present in the NBAtlas dataset.

We use the following approach to determine the final annotation.
Throughout, we use CL ontology ids to perform comparisons. 
For more information about how these ontology ids were determined for NBAtlas labels, see `../references/README.md`.

1. If `SingleR` and `scANVI/scArches` labels exactly agree, we assign that label
2. If broad label families (e.g., `T cell` would be the broad family for `CD4+ T cell`) for `SingleR` and `scANVI/scArches` labels agree, we assign the broad family label.
  * See [`references/nbatlas-label-map.tsv`](references/nbatlas-label-map.tsv) and [`references/README.md`](references/README.md) for how labels were mapped to families.
3. If `SingleR` and `scANVI/scArches` labels disagree, we then compare to the consensus cell type; if at least one inference agrees with the consensus cell type, we assign the inferred label.
  * We perform this first at the label level and then at the broad family level.
4. Finally, following the conclusions of Bonine et al. (2024), we designate any cell labeled as `Neuroendocrine` as a tumor cell.


## Setup

```{r, warning = FALSE}
suppressPackageStartupMessages({
  library(ggplot2)
  library(patchwork)
  library(SingleCellExperiment)
})

theme_set(
  theme_bw()
)
umap_theme <- list(
  theme(
    axis.text = element_blank(), 
    axis.ticks = element_blank(), 
    legend.position = "bottom", 
    aspect.ratio = 1
  )
)

# settings
options(
  dplyr.summarise.inform = FALSE, 
  readr.show_col_types = FALSE
)
```

### Paths

```{r base paths}
module_dir <- rprojroot::find_root(rprojroot::is_renv_project)
repository_base <- file.path(module_dir, "..", "..")

data_dir <- file.path(repository_base, "data", "current", "SCPCP000004")
merged_dir <- file.path(repository_base, "data", "current", "results", "merge-sce", "SCPCP000004")
ref_dir <- file.path(module_dir, "references")
results_dir <- file.path(module_dir, "results")
singler_dir <- file.path(results_dir, "singler")
scanvi_dir <- file.path(results_dir, "scanvi")
```

```{r file paths}
# merged SCE file
sce_file <- file.path(
  merged_dir,
  "SCPCP000004_merged.rds"
)

# SingleR files
singler_files <- list.files(
  path = singler_dir,
  pattern = "_singler-annotations\\.tsv",
  recursive = TRUE,
  full.names = TRUE
) |>
  # add names as library id
  purrr::set_names(
    \(x) {
      stringr::str_remove(basename(x), "_singler-annotations.tsv")
    }
  )

# scanvi predictions
scanvi_file <- file.path(
  scanvi_dir, 
  "scanvi-predictions.tsv"
)

# palette file
palette_file <- file.path(
  module_dir,
  "palettes",
  "nbatlas-cell-type-palette.tsv"
)

# label map file
nbatlas_label_map_file <- file.path(
  ref_dir, 
  "nbatlas-label-map.tsv"
)

# label ontologies file
nbatlas_ontology_file <- file.path(
  ref_dir, 
  "nbatlas-ontology-ids.tsv"
)

# marker genes from NBAtlas
nbatlas_markers_file <- file.path(
  ref_dir, 
  "nbatlas-marker-genes.tsv"
)

# validation groups URL, used to obtain consensus "family" ontologies (aka broad validation group ontologies)
validation_url <- "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-consensus/references/consensus-validation-groups.tsv"
```


### Functions

```{r}
# Source utilities functions:
# - subset_nbatlas_markers()
# - faceted_umap()
# - generate_dotplot()
source(file.path(module_dir, "scripts", "utils", "celltype-utils.R"))
```


```{r}

#' Prepare data frame of scANVI or SingleR labels for annotation
#'
#' @param df Data frame to prepare
#' @param annot_type "singler" or "scanvi"
#' @param ontology_df Data frame of ontology ids
#' @param label_map_df Data frame mapping labels to family labels
#'
#' @returns Wide data frame with ontologies and family labels for the given annotation type
prep_for_annotation <- function(
    df, 
    annot_type, 
    ontology_df, 
    label_map_df) {

  df |>
    dplyr::select(all_of(c("cell_id", annot_type))) |>
    dplyr::rename(label = annot_type) |>
    ####### Join in the family labels
    dplyr::left_join(label_map_df, by = c("label" = "NBAtlas_label")) |>
    dplyr::rename(family = NBAtlas_family) |>
    ######### Obtain LABEL ontologies
    dplyr::left_join(ontology_slim_df, by = c("label" = "NBAtlas_label")) |>
    dplyr::rename(label_ontology = CL_ontology_id) |>
    ######## Obtain FAMILY ontologies
    dplyr::left_join(ontology_slim_df, by = c("family" = "NBAtlas_label")) |>
    dplyr::rename(family_ontology = CL_ontology_id) |>
    # rename columns to start with `annot_type`
    dplyr::rename_with(\(x) {paste(annot_type, x, sep = "_")}, -cell_id) 
}
```


### Read and prepare input data

Read merged SCE object:

```{r}
merged_sce <- readRDS(sce_file)
# immediately remove assays we don't need for space
assay(merged_sce, "spliced") <- NULL
assay(merged_sce, "counts") <- NULL
reducedDim(merged_sce, "PCA") <- NULL
```

Read `SingleR` results:

```{r}
singler_df <- singler_files |>
  purrr::map(readr::read_tsv) |>
  purrr::list_rbind(names_to = "library_id") |>
  dplyr::mutate(
    # add cell id column for unique rows & joining
    cell_id = glue::glue("{library_id}-{barcodes}"),
    # recode NA -> "Unknown" and NE -> "Neuroendocrine"
    singler = dplyr::case_when(
      is.na(pruned.labels) ~ "Unknown", 
      pruned.labels == "NE" ~ "Neuroendocrine",
      .default = pruned.labels)
  ) |>
  dplyr::select(cell_id, singler)
```

Read `scANVI` results:

```{r}
scanvi_df <- readr::read_tsv(
  file.path(scanvi_dir, "scanvi_predictions.tsv")
) |>
  dplyr::select(
    cell_id, 
    scanvi = scanvi_prediction, 
    starts_with("pp_")
  ) |>
  tidyr::pivot_longer(
    starts_with("pp_"), 
    names_to = "posterior_celltype", 
    values_to = "posterior"
  ) |>
  dplyr::mutate(posterior_celltype = stringr::str_remove(posterior_celltype, "^pp_")) |>
  dplyr::filter(scanvi == posterior_celltype) |>
  # recode to Unknown if below the threshold, and NE -> Neuroendocrine
  dplyr::mutate(scanvi = dplyr::case_when(
    posterior < params$scanvi_pp_threshold ~ "Unknown", 
    scanvi == "NE" ~ "Neuroendocrine", 
    .default = scanvi)) |>
  dplyr::select(cell_id, scanvi)
```

Read additional helper files:

```{r}
palette_df <- readr::read_tsv(palette_file)
celltype_pal <- palette_df$hex_color
names(celltype_pal) <- palette_df$NBAtlas_label

label_map_df <- readr::read_tsv(nbatlas_label_map_file)
ontology_df <- readr::read_tsv(nbatlas_ontology_file)

nbatlas_markers_df <- readr::read_tsv(nbatlas_markers_file)

validation_df <- readr::read_tsv(validation_url) |>
  dplyr::rename(
    consensus = consensus_annotation,
    consensus_family_label = validation_group_annotation,
    consensus_family_ontology = validation_group_ontology 
  )
```


Combine annotations into a single data frame:

```{r}
celltype_df <- scuttle::makePerCellDF(
  merged_sce,
  use.coldata = c("sample_id", "is_xenograft", "consensus_celltype_annotation", "consensus_celltype_ontology")
) |>
  tibble::rownames_to_column("cell_id") |>
  tidyr::separate(cell_id, into = c("library_id", "cell_barcode"), remove = FALSE) |>
  # keep UMAP for eventual viz
  dplyr::rename(
    consensus = consensus_celltype_annotation, 
    consensus_ontology = consensus_celltype_ontology,
    UMAP1 = UMAP.1, 
    UMAP2 = UMAP.2
  ) |>
  dplyr::left_join(singler_df, by = "cell_id") |>
  dplyr::left_join(scanvi_df, by = "cell_id")


# Pull out metadata we'll want later
sample_metadata <- celltype_df |>
  dplyr::select(sample_id, library_id, is_xenograft) |>
  unique()
```


## Annotate

Firt we prepare the data frame for annotation.

```{r, warning = FALSE}
ontology_slim_df <- ontology_df |>
  dplyr::select(-CL_annotation)

singler_annotation_df <- prep_for_annotation(
  celltype_df, 
  "singler", 
  ontology_slim_df, 
  label_map_df
)
scanvi_annotation_df <- prep_for_annotation(
  celltype_df, 
  "scanvi", 
  ontology_slim_df, 
  label_map_df
)

annotation_df <- celltype_df |>
  dplyr::select(cell_id, consensus, consensus_ontology) |>
  dplyr::left_join(singler_annotation_df, by = "cell_id") |>
  dplyr::left_join(scanvi_annotation_df, by = "cell_id") |>
  dplyr::left_join(validation_df, by = c("consensus", "consensus_ontology")) |>
  dplyr::select(cell_id, starts_with("consensus"), starts_with("singler"), starts_with("scanvi"))
```


In the code below, we compare ontology ids to derive final annotations.
However, based on those comparisons, we assign a final _label_, not final ontology id, and then add the final ontology ids back into those annotations.
This is because we have several `NA` ontology ids, and we would not be able to unambiguously map their labels in if we assigned ontologies.

```{r}
final_annotation_df <- annotation_df |> 
  dplyr::mutate(
    # For all comparisons, make sure we aren't comparing NA to NA and getting TRUE
    final_label = dplyr::case_when(
      ########### Check for exact match between SingleR/scANVI
      !is.na(scanvi_label_ontology) & singler_label_ontology == scanvi_label_ontology ~ scanvi_label, 
      ########### Check for family match between SingleR/scANVI
      !is.na(scanvi_family_ontology) & singler_family_ontology == scanvi_family_ontology ~ scanvi_family,
      ########## Now use agreement with consensus to assign a label: first by label, and then by family
      !is.na(consensus_ontology) & singler_label_ontology  == consensus_ontology ~ singler_label,
      !is.na(consensus_ontology) & scanvi_label_ontology   == consensus_ontology ~ scanvi_label,
      !is.na(consensus_family_ontology) & singler_family_ontology == consensus_family_ontology ~ singler_family,
      !is.na(consensus_family_ontology) & scanvi_family_ontology  == consensus_family_ontology ~ scanvi_family,
      # Everything else is Unknown
      .default = "Unknown"
    )
  ) |>
  # Now, we can bring map the final ontologies into the data frame
  dplyr::inner_join(
    ontology_df |> dplyr::rename(final_label = NBAtlas_label, final_id = CL_ontology_id), 
    by = "final_label"
  ) |>
  # add tumor column for visualization
  dplyr::mutate(cell_class = dplyr::case_when(
    final_label == "Neuroendocrine" ~ "tumor", 
    final_label == "Unknown" ~ "unknown", 
    .default = "normal"
  ))
``` 


## Explore final annotations

What fraction of cells have been annotated?


```{r}
sum(!is.na(final_annotation_df$final_id)) / nrow(final_annotation_df)
```


What fraction of cells have been annotated per library?

```{r, fig.width = 8, fig.height = 4}
frac_labeled_df <- final_annotation_df |>
  tidyr::separate(cell_id, into = c("library_id", "barcode"), remove = FALSE) |>
  dplyr::group_by(library_id) |>
  dplyr::summarize(
    frac_labeled = sum(!is.na(final_id))/dplyr::n(), 
    library_size = dplyr::n()
  ) |>
  # get PDX logical for coloring
  dplyr::inner_join(sample_metadata, by = "library_id")


p1 <- ggplot(frac_labeled_df) + 
  aes(x = frac_labeled) + 
  geom_histogram(bins = 40) +
  labs(x = "Fraction of library labeled") +
  theme_bw()


p2 <- ggplot(frac_labeled_df) + 
  aes(x = library_size, y = frac_labeled, color = is_xenograft) + 
  geom_point() + 
  labs(
    x = "Total cells in library",
    y = "Fraction of library labeled"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")


p1 + p2
```

### UMAPs

In this section, we show the UMAP from the merged (not batch-corrected!) `SCPCP000004` object colored by cell type annotations and other potentially relevant metadata.


```{r}
label_order <- label_map_df |>
  dplyr::add_count(NBAtlas_family) |>
  dplyr::arrange(desc(n), NBAtlas_family) |>
  dplyr::filter(NBAtlas_label %in% final_annotation_df$final_label) |>
  dplyr::pull(NBAtlas_label)
label_order <- c(label_order, "Unknown")

# prepare data frame for plotting
umap_df <- celltype_df |>
  dplyr::select(cell_id, sample_id, library_id, is_xenograft, UMAP1, UMAP2) |>
  dplyr::left_join(final_annotation_df, by = "cell_id") |>
  dplyr::mutate(final_label = factor(final_label, levels = label_order))
```


#### Colored by cell type assignment

Note that in the UMAP below, cell types in these same "family" each share a color:

* T cells
* Natural killer cells
* Conventional dendritic cells
* Monocytes

```{r fig.width = 10, fig.height = 10}
ggplot(umap_df) +
  aes(x = UMAP1, y = UMAP2, color = final_label) +
  geom_point(alpha = 0.2, size = 0.3) +
  scale_color_manual(values = celltype_pal) +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) + 
  umap_theme +
  theme(legend.position = "bottom")
```

#### Faceted by cell type assignment

This UMAP is the same as the previous one, except each facet highlights one specific cell type.
In this plot, only cell types with at least 10 cells are shown.


```{r fig.width = 16, fig.height = 16}
umap_df |>
  dplyr::add_count(final_label) |>
  dplyr::filter(n > 10) |>
  faceted_umap(
    annotation_column = final_label,
    celltype_colors = celltype_pal, 
    facet_wrap_cols = 6
  ) + 
    umap_theme +
    theme(legend.position = "none")
```


#### Colored by tumor classification


```{r fig.width = 8, fig.height = 8}
ggplot(umap_df) +
  aes(x = UMAP1, y = UMAP2, color = cell_class) +
  geom_point(alpha = 0.2, size = 0.3) +
  scale_color_brewer(palette = "Set2") +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
  umap_theme
```

#### Colored by PDX status

```{r fig.width = 8, fig.height = 8}
ggplot(umap_df) +
  aes(x = UMAP1, y = UMAP2, color = is_xenograft) +
  geom_point(alpha = 0.2, size = 0.3) +
  scale_color_brewer(palette = "Set1") +
  guides(color = guide_legend(override.aes = list(size = 2, alpha = 1))) +
  umap_theme
```


### Dot plot

This section shows dot plots of marker gene expression for the final annotations.
Specifically, we show the top 5 highest `log2FC` marker genes per NBAtlas cell type for validation.
Marker genes are taken directly from the NBAtlas paper where they were identified with `Seurat::FindMarkers()`. 

Marker genes that are present in three or more cell types are not included.
In addition, the upper limit for the color scale has been censored to a maximum of 6 to ensure visibility of expression levels across all cell types.

```{r}
input_dotplot_df <- final_annotation_df |>
  dplyr::filter(final_label != "Unknown") |>
  # recode cell types to match what is present in NBAtlas marker genes
  dplyr::mutate(
    label_recoded = dplyr::case_when(
      stringr::str_detect(final_label, "monocyte") ~ "Monocyte", 
      stringr::str_detect(final_label, "cDC") ~ "cDC",
      .default = final_label
  )) |>
  dplyr::select(cell_id, label_recoded)

# data frame of top 5 upregulated genes per cell type
top_nbatlas_markers_df <- subset_nbatlas_markers(nbatlas_markers_df, 5, "up") |>
  # only markers in no more than 2 categories
  # this is a good middle ground to ensure there are at least 2 markers per cell type
  dplyr::add_count(ensembl_gene_id) |>
  dplyr::filter(n < 3) |>
  dplyr::select(-n) 
```


```{r}
# get total number of cells per final annotation group and set up y_label
total_cells_df <- input_dotplot_df |>
  dplyr::count(label_recoded, name = "total_cells") |>
  dplyr::mutate(y_label = glue::glue("{label_recoded} ({total_cells})")) |>
  dplyr::inner_join(label_map_df, by = c("label_recoded" = "NBAtlas_label")) |>
  dplyr::group_by(NBAtlas_family) |>
  dplyr::mutate(family_total_cells = sum(total_cells)) |>
  # order by family count, but put Unknown at the end
  dplyr::arrange(desc(family_total_cells)) |>
  # amazing new dplyr trick; only this row is TRUE, so it gets moved to the last row
  dplyr::arrange(label_recoded == "Unknown")

total_cells_df$y_label <- factor(total_cells_df$y_label, levels = rev(total_cells_df$y_label))
nbatlas_bar_order <- total_cells_df$label_recoded
```

```{r}
# Prepare the expressed_genes vector
# we only care about if that gene is expressed otherwise we won't waste memory and include it
gene_sums <- rowData(merged_sce) |>
  as.data.frame() |>
  dplyr::select(contains("detected")) |>
  as.matrix() |>
  rowSums()
expressed_genes <- names(gene_sums)[gene_sums > 0]
```

```{r, fig.width = 32, fig.height = 16, eval = !params$testing}
generate_dotplot(
  merged_sce = merged_sce,
  markers_df = top_nbatlas_markers_df,
  celltype_df = input_dotplot_df,
  total_cells_df = total_cells_df,
  expressed_genes = expressed_genes,
  bar_order = nbatlas_bar_order, 
  celltype_palette = celltype_pal, 
  min_cells = 10 # only show cell types with at least 10 cells
) +
  theme(legend.position = "bottom")
```


## Export

Export the final table with submission-ready column names.

```{r}
final_annotation_df |>
  dplyr::select(
    cell_id, 
    cell_type_assignment = final_label, 
    tumor_cell_classification = cell_class,
    CL_ontology_id = final_id, 
    CL_annotation
  ) |>
  readr::write_tsv(
    params$annotations_tsv
  )
```

## Session Info

```{r}
sessionInfo()
```
